How to get rid of the bad python code


In [19]:
import json
import datetime
import tqdm



folder = '/dfs/scratch2/fcipollone/stackoverflow/guesslang_and_ast/outfiles'

lines = []

# Each dict maps a length bucket (see _length_bucket) to the list of code strings
# that fell into that category:
#   guess_and_parse -- Guesslang says python AND Parsable == "True"
#   guess_not_parse -- Guesslang says python, Parsable != "True"
#   parse_not_guess -- Parsable == "True", Guesslang says something else
#   total           -- union of the three categories above
guess_and_parse = {}
guess_not_parse = {}
parse_not_guess = {}
total = {}
dates = []


def _length_bucket(code):
    """Return len(code) rounded to a multiple of 10 and capped at 10000.

    round(n, -1) on an int rounds to the nearest multiple of 10; exact ties go
    to the even multiple (e.g. 25 -> 20, 35 -> 40).
    """
    return min(int(round(len(code), -1)), 10000)


for file_num in tqdm.tqdm(range(400)):
    filename = folder + '/file' + str(file_num) + '.txt'
    # `with` closes each of the 400 files as soon as it has been read.
    with open(filename) as infile:
        for line in infile:
            # One JSON object per line; its 'CodeBlocks' entry is a list of dicts.
            line_obj = json.loads(line)
            for code_block in line_obj['CodeBlocks']:
                code = code_block['code']
                bucket = _length_bucket(code)
                is_python_guess = code_block['Guesslang'].strip().lower() == "python"
                # 'Parsable' is compared as the string "True", not as a bool.
                is_parsable = code_block['Parsable'] == "True"

                # Blocks that neither parse nor are guessed as python are ignored.
                if not (is_parsable or is_python_guess):
                    continue

                total.setdefault(bucket, []).append(code)
                if is_parsable and is_python_guess:
                    category = guess_and_parse
                elif is_parsable:
                    category = parse_not_guess
                else:
                    category = guess_not_parse
                category.setdefault(bucket, []).append(code)


100%|██████████| 400/400 [01:23<00:00,  4.80it/s]

In [22]:
import random
desired_code_length = round(100,-1)
num_samples = 50


def print_random_samples(blocks_by_length, code_length, sample_count):
    """Print randomly chosen code blocks from one length bucket for hand labeling.

    Args:
        blocks_by_length: dict mapping a length bucket to a list of code strings
            (e.g. guess_and_parse, guess_not_parse or parse_not_guess).
        code_length: the length bucket to sample from.
        sample_count: how many entries to print; capped at the bucket size.

    The entries are drawn in a single random.sample call, so no list entry is
    printed twice. Drawing one entry at a time in a loop could repeat entries
    and shrink the effective hand-labeling sample.
    """
    candidates = blocks_by_length.get(code_length, [])
    if not candidates:
        # A missing or empty bucket is reported instead of raising KeyError.
        print('Number to choose from', 0)
        return
    for snippet in random.sample(candidates, min(sample_count, len(candidates))):
        print('Number to choose from', len(candidates))
        print('-'*5)
        print(snippet)
        print('*'*100)


# Pass guess_and_parse or parse_not_guess instead to hand-label the other categories.
print_random_samples(guess_not_parse, desired_code_length, num_samples)


Number to choose from 23890
-----
    webBrowser1.Document.Write("<BODY background= D:\\Desktop\\123.jpg bgColor=#ffffff text=#000000>");

****************************************************************************************************
Number to choose from 23890
-----
update a
set a.company_id = b.company_id
from vendorRegkeys a, users b
where a.createdby_id = b.user_id

****************************************************************************************************
Number to choose from 23890
-----
In []: p= 2* rand(3, 1e4)- 1
In []: p= p[:, sum(p* p, 0)** .5<= 1]
In []: p.shape
Out[]: (3, 5216)

****************************************************************************************************
Number to choose from 23890
-----
jQuery('script').remove()
jQuery('noscript').remove()
jQuery('body').text().replace(/\s{2,9999}/g, ' ')

****************************************************************************************************
Number to choose from 23890
-----
  sum1 = 0
  for i = 1:K
  sum1 = sum1 + Y(k,i) *log(Htheta(k)) + (1 - Y(k,i))*log(1-Htheta(k))

****************************************************************************************************
Number to choose from 23890
-----
cannot select option, no option with text '20120905' in select box 'date' (Capybara::ElementNotFound)

****************************************************************************************************
Number to choose from 23890
-----
return this.patientRepository.FindAll(spc).OrderBy(a => a.Id).Skip(start).Take(limit).ToList();

****************************************************************************************************
Number to choose from 23890
-----
Dim lst1 as new list(of integer)
Dim lst2 as new list(of integer)
Dim lst3 as new list(of integer)

****************************************************************************************************
Number to choose from 23890
-----
ssh user@host 'cat - > /tmp/file.ext; do_something_with /tmp/file.ext;rm /tmp/file.ext' < file.ext 

****************************************************************************************************
Number to choose from 23890
-----
Mockito.when(nrClient.uploadFiles("DF49ACBC8", Matchers.anyList(), "dl"))
       .thenReturn("");

****************************************************************************************************
Number to choose from 23890
-----
var s = "";
for ( x in { 3:3, 1:1 } ) { s += x }
if ( s === "31" ) alert( 'JSC' )
else alert( 'V8' )

****************************************************************************************************
Number to choose from 23890
-----
Warning: ftp_put() [function.ftp-put]: Filename invalid in D:\xampp\htdocs\mycloud\edit.php on line 7

****************************************************************************************************
Number to choose from 23890
-----
if (arr[item] == 'a' || arr[item] == 'e' || arr[item] == 'i' || arr[item] == 'o' || arr[item] == 'u')

****************************************************************************************************
Number to choose from 23890
-----
Info := 'destination=' + UrlEncode(EmailDestAddressFromIni) +
  '&' + 'Nicebody=' + UrlEncode(Nicebody);

****************************************************************************************************
Number to choose from 23890
-----
select t1.* 
from table1 t1 join table2 t2 on t2.table1ID=t1.id
group by t1.id
having count(*)<2

****************************************************************************************************
Number to choose from 23890
-----
document
  .getElementById("image_chk")
  .style.backgroundImage="url( 'img/"+ clicked_id+".jpg' )";

****************************************************************************************************
Number to choose from 23890
-----
def foo(a):
    def bar(a):
        a -= 1
        return a
    return bar(a)
>>> print foo(5) 
4

****************************************************************************************************
Number to choose from 23890
-----
puts res.inspect # show the nested bits via  stdout
                 # or through some other logging

****************************************************************************************************
Number to choose from 23890
-----
def mapper(k,v_list):
  for v in v_list:
    if criteria:
      write to HDFS
    else:
      emit

****************************************************************************************************
Number to choose from 23890
-----
rails g teacher name age:integer email sex course 
#replaced class with course, now its working great.

****************************************************************************************************
Number to choose from 23890
-----
@Html.HiddenFor(x => x.StartDate)  // if you don't want to display it or use the display for to show it

****************************************************************************************************
Number to choose from 23890
-----
$("#content").wrap("<table>")

    or with attribute

$("#content").wrap("<table id='wrapper'>")

****************************************************************************************************
Number to choose from 23890
-----
>>> import Tkinter #tkinter
>>> root = Tkinter.Tk()
>>> root.tk.eval('package require Tkhtml')
'0.0'

****************************************************************************************************
Number to choose from 23890
-----
from xx in table
where (from yy in string[] 
       select yy).Contains(xx.uid.ToString())
select xx

****************************************************************************************************
Number to choose from 23890
-----
select top 6 Client_Country, count(*) Total
from table group by Client_Country
order by total desc

****************************************************************************************************
Number to choose from 23890
-----
... inner join Containstable (fulltextTable, mycolumn, ?) as KeyTable on id = KeyTable.[KEY] ...

****************************************************************************************************
Number to choose from 23890
-----
frame.setLocation(0,0) is top left.
frame.setLocation(0,700) moves it as close as i can to the bottom

****************************************************************************************************
Number to choose from 23890
-----
def getmonthname:
    months = ['January', 'February', ... , 'December']
    return months[month - 1]

****************************************************************************************************
Number to choose from 23890
-----
for (i in 1:(n-1))
  for (j in (i+1):n)
    cat(sprintf("(%g,%g)\n", i, j))
## (1,2)
## (1,3)
## (2,3)

****************************************************************************************************
Number to choose from 23890
-----
Npgsql.EF6, version 2.0.12-pre4(Prerelease)
Npgsql, version 2.0.14.3
EntityFramework, version 6.0.2

****************************************************************************************************
Number to choose from 23890
-----

>>> p = re.compile('.*', re.DEBUG)
max_repeat 0 65535
  any None
>>>                         

****************************************************************************************************
Number to choose from 23890
-----
#set ($themeDisplay = $httpServletRequest.getAttribute("THEME_DISPLAY"))
$themeDisplay.isSignedIn()

****************************************************************************************************
Number to choose from 23890
-----
undefined method `serial_number' for #<User:0x000000060b1d40>
extracted source (around line #8):
...

****************************************************************************************************
Number to choose from 23890
-----
t1 = Time.now
t2 = Time.at(t1.to_i)
puts t1 == t2    # Says False
puts t1.eql?(t2) # Says False

****************************************************************************************************
Number to choose from 23890
-----
cell.backgroundColor = UIColor.colorWithRed(125/255.0, green: 125/255.0, blue: 125/255.0, alpha: 1.0)  

****************************************************************************************************
Number to choose from 23890
-----
org.jdesktop.jdic.init.JdicInitException: java.lang.UnsatisfiedLinkError: no jdic in java.library.path

****************************************************************************************************
Number to choose from 23890
-----
View a report: Run report with html output -> save output to /tmp/abcd/ -> embedd it into your webapp.

****************************************************************************************************
Number to choose from 23890
-----
; file one 
10     0.2   0.5   0.3
20     0.1   0.6   0.8
30     0.2   0.1   0.1
40     0.1   0.5   0.3

****************************************************************************************************
Number to choose from 23890
-----
if (last row is selected)
    add a new row to the table

invoke the default down arrow action

****************************************************************************************************
Number to choose from 23890
-----

>>> p = re.compile('.*', re.DEBUG)
max_repeat 0 65535
  any None
>>>                         

****************************************************************************************************
Number to choose from 23890
-----
new_list = [f1, f2, datetime.datetime.strptime(f3, '%m/%d/%Y').date()
    for f1, f2, f3 in old_list]

****************************************************************************************************
Number to choose from 23890
-----
    correctmsg.ShowMsg
    ' this was hiding the form as soon as it shows:
    'correctmsg.Hide()

****************************************************************************************************
Number to choose from 23890
-----
ExpandoMetaClass.enableGlobally()
Integer.metaClass.gimmeAP = {->return 'p'}
assert 3.gimmeAP() == 'p'

****************************************************************************************************
Number to choose from 23890
-----
sed -i".bak" "s:'export LD_PRELOAD="/usr/lib/libopenvg.so /usr/lib/libinterposer.so"'::" ~/.bashrc

****************************************************************************************************
Number to choose from 23890
-----
@(Html.Telerik().DatePicker()
    .Name("DataField")
    .ShowButton(true)
    .Value(Model.DataField))

****************************************************************************************************
Number to choose from 23890
-----
UPDATE omc_product
SET image = CONCAT('assets/', image), thumbnail = CONCAT('assets/', thumbnail)

****************************************************************************************************
Number to choose from 23890
-----
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
****************************************************************************************************
Number to choose from 23890
-----
import csv

data = csv.reader(open('c:\x\list.csv' ))

for row in data:

    print(row)

print('ready')

****************************************************************************************************
Number to choose from 23890
-----
normalization_factor = float(sum(p.values()))
for key, value in p:
p[key] = value/normalization_factor

****************************************************************************************************
Number to choose from 23890
-----
to merge all the changes applied on it (assume there is no conflict) without opening the docx file.
****************************************************************************************************

Hand-labeling code


In [15]:
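# These are the hand-labeled counts of samples that are actually Python, out of 50 samples per category,
# listed per code length. Categories are (guesslang says python, parses): (true-true, true-false, false-true)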
#For 100:
#48, 8, 16
#For 500:
#50, 19, 12
#For 1000:
#50, 13, 7

In [16]:
# These are the same counts expressed as percentages of the 50 samples, for convenience (true-true, true-false, false-true)
#For 100:
#96, 16, 32
#For 500:
#100, 38, 24
#For 1000:
#100, 26, 14

Analysis

Given the above percentages, we can estimate how many of the code blocks are actually Python, and what fraction of the Python code blocks is captured by the first category (true-true).


In [1]:
# Code-block totals per category.
# NOTE(review): presumably python_true = guesslang says python and the block parses,
# python_false = guesslang says python but the block does not parse, and other_false =
# the block parses but guesslang says another language (labelled "other - true" in the
# qualitative notes) -- confirm the naming of other_false.
python_true, python_false, other_false = 1695196, 1471445, 2234022

In [13]:
# Bounds on the number of Python code blocks. Each category total is weighted by the
# highest hand-labeled Python rate seen across the sampled lengths (upper bound) or by
# the lowest one (lower bound).
max_python = (
    1 * python_true
    + .38 * python_false
    + .32 * other_false
)
min_python = (
    .96 * python_true
    + .16 * python_false
    + .14 * other_false
)

In [10]:
# Show the upper bound, then the lower bound, of the estimated Python block count.
for bound in (max_python, min_python):
    print(bound)


2969232.14
2175582.44

In [11]:
# Estimated share of all code blocks that are Python (upper bound, then lower bound) --
# about half of it isn't Python at all!
for python_estimate in (max_python, min_python):
    print(python_estimate / (python_true + python_false + other_false))


0.5497903016722206
0.40283617770632973

In [14]:
# Share of the estimated Python code that the first group (python_true) captures.
# Dividing by the upper estimate gives the low end of the range, and vice versa.
for python_estimate in (max_python, min_python):
    print(python_true / python_estimate)


0.570920669072375
0.7791918011619914

In [17]:
# Conclusion:
# Although this method of estimation may be a bit crude, it gives us a range that I'm pretty sure about, and I would
# estimate that the true ratio of python code captured is around 65% or more

In [24]:
# Qualitative information
# python - false (this is when guesslang says python but it does not parse)
#2  JSON
#17 Other programming languages
#18 python 
#3  ipython shell
#10 trace 

# other - true (this is when it parses but guesslang says something other than python)
#34 JSON
#9  python
#4  comments
#3  other programming languages

In [ ]: